
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import pandas as pd


def main():
    LOGGER.info('SCRIPT: b_finding_suppliers.py')
    # Given the list of factset ids for oems that we collected, export a csv with info about their suppliers in factset
    df_suppliers = get_fctids_of_suppliers_of_OEMs()
    # Add name and home region of suppliers from factset
    df_suppliers = add_fct_name_region(df_suppliers)
    df_suppliers = get_bvdids_of_suppliers(df_suppliers)
    df_suppliers.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_ids.csv', index=False)
    collect_naics_code(df_suppliers)
    exploring_naics_code()
    return


def get_fctids_of_suppliers_of_OEMs():
    LOGGER.info('get_fctids_of_suppliers_of_OEMs')
    # Import factset ids of oems
    OEMs = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/OEMs_factset.csv')
    OEMs = OEMs[OEMs['fctid'].notnull()]
    LOGGER.info('Producers: Nbr factset ids: {}'.format(OEMs['fctid'].nunique()))
    # Import supply-chain info from factset
    dataSC = pd.read_csv(PATHS.factset / 'SUPPLY_CHAIN.csv')
    df_suppliers = dataSC[dataSC['CUSTOMER_ID'].isin(OEMs['fctid'].tolist())]['SUPPLIER_ID'].drop_duplicates()
    LOGGER.info('Suppliers: Nbr factset ids: {}'.format(df_suppliers.nunique()))
    # The 179 auto producers in factset have 3131 suppliers in factset
    # Some of the firms listed as suppliers are in fact also OEMs. Let's drop those
    df_suppliers = df_suppliers[~df_suppliers.isin(OEMs['fctid'])]
    LOGGER.info('Suppliers: Nbr factset ids: {}'.format(df_suppliers.nunique()))
    # 3080 factset ids of suppliers after dropping theose suppliers that were oems
    return df_suppliers


def add_fct_name_region(df_suppliers):
    Factset = pd.read_csv(PATHS.factset / 'COMPANY_withISIN.csv', sep=',', usecols=['NAME', 'ID', 'HOME_REGION'])
    Factset = Factset.drop_duplicates().rename(columns={'NAME': 'fctname', 'ID': 'fctid', 'HOME_REGION': 'fct_home_region'})
    Factset = Factset[Factset['fctid'].isin(df_suppliers)]
    supplier_name = Factset.groupby('fctid')['fctname'].agg(lambda x: list(x.unique())[0])
    supplier_region = Factset[Factset['fct_home_region'] != 'None'].groupby('fctid')['fct_home_region'].agg(lambda x: ','.join([str(i) for i in list(x.unique())]))
    df_suppliers = pd.concat([supplier_name, supplier_region], axis=1).reset_index()
    return df_suppliers


def get_bvdids_of_suppliers(df_suppliers):
    LOGGER.info('get_bvdids_of_suppliers')
    coltokeep = ['bvdID', 'fctNAME', 'fctID', 'fctNAMEmatch', 'bvdNAMEmatch']
    # Import Orbis-Factset correspondence: we created this correspondence by string matching of the company names
    FactsetOrbisLink = pd.read_csv(PATHS.factset / 'OrbisFactsetCorrespondence.csv', usecols=coltokeep)
    FactsetOrbisLink = FactsetOrbisLink.drop_duplicates().rename(columns={'bvdID': 'bvdid'})
    # Get bvdids of suppliers
    # How many of these suppliers have I matched?
    FactsetOrbisLink = FactsetOrbisLink[FactsetOrbisLink['fctID'].isin(df_suppliers['fctid'])]
    percentage = round(100*FactsetOrbisLink['fctID'].nunique()/df_suppliers['fctid'].nunique())
    print('Suppliers: Nbr factset ids with an Orbis match: {} ({}%)'.format(FactsetOrbisLink['fctID'].nunique(), percentage))
    print('Suppliers: Nbr bvdids: {}'.format(FactsetOrbisLink['bvdid'].nunique()))
    FactsetOrbisLink = FactsetOrbisLink[['fctID', 'bvdNAMEmatch', 'bvdid']].rename(columns={'fctID': 'fctid'}).drop_duplicates()
    df_suppliers = df_suppliers.merge(FactsetOrbisLink, on='fctid', how='left')
    # Add orbis region
    supplier_region_orbis = df_suppliers.groupby('fctid')['bvdid'].agg(lambda x: ','.join(list(set([str(i)[:2] for i in list(x.unique())])))).rename('orbis_region')
    df_suppliers = df_suppliers.merge(supplier_region_orbis.reset_index(), on='fctid', how='left')
    df_suppliers['bvdNAMEmatch'].fillna('NOT_MATCHED', inplace=True)
    df_suppliers['bvdid'].fillna('NOT_MATCHED', inplace=True)
    return df_suppliers


def collect_naics_code(df_suppliers):
    LOGGER.info('collect_naics_code')
    df_suppliers = df_suppliers[df_suppliers['bvdid'] != 'NOT_MATCHED']
    list_countries = df_suppliers['bvdid'].str[:2].unique()
    suppliers_naics = []
    # list_col = ['BvD ID number', 'NAICS, Primary code(s)', 'NACE Rev. 2 main section', 'NACE Rev. 2, Core code (4 digits)', 'US SIC, Core code (3 digits)', 'US SIC core code, text description']
    list_col = ['BvD ID number', 'NAICS, Primary code(s)']
    co = list_countries[0]
    for co in list_countries:
        file = 'Industry_classifications-{}.csv'.format(co)
        if file in os.listdir(PATHS.orbis / '2018/Industry_classifications/by_country/csv/'):
            # LOGGER.info('---- Starting File {}'.format(file))
            for i, datachunk in enumerate(pd.read_csv(PATHS.orbis / '2018/Industry_classifications/by_country/csv' / file, sep='\t', usecols=list_col, chunksize=1000000)):
                list_bvdids_in_country = df_suppliers[df_suppliers['bvdid'].str[:2] == co]['bvdid'].tolist()
                datachunk = datachunk[datachunk['BvD ID number'].isin(list_bvdids_in_country)].drop_duplicates()
                suppliers_naics.append(datachunk)
    suppliers_naics = pd.concat(suppliers_naics, sort=True).reset_index().drop(columns='index')
    NAICSdescription = pd.read_csv(PATHS.other / 'NAICS_Codes.csv')
    NAICSdescription['Title'] = NAICSdescription['Title'].apply(lambda x: x.strip()[:-1] if x.strip()[-1] == 'T' else x.strip())
    suppliers_naics = suppliers_naics.merge(NAICSdescription, left_on='NAICS, Primary code(s)', right_on='Code', how='left')
    suppliers_naics = suppliers_naics[['BvD ID number', 'NAICS, Primary code(s)', 'Title']]
    suppliers_naics = suppliers_naics[suppliers_naics['NAICS, Primary code(s)'].notnull()]
    suppliers_naics.to_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_naics.csv', index=False)
    LOGGER.info('Saved suppliers_naics.csv')
    return suppliers_naics


def exploring_naics_code():
    suppliers_naics = pd.read_csv(PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_naics.csv')
    dfNAICS = suppliers_naics[['BvD ID number', 'NAICS, Primary code(s)']].drop_duplicates()
    dfNAICS = dfNAICS[dfNAICS['NAICS, Primary code(s)'].notnull()]
    NAICSdescription = pd.read_csv(PATHS.other / 'NAICS_Codes.csv')
    NAICSdescription.index = NAICSdescription['Code']
    NAICSdescription_dict = NAICSdescription['Title'].to_dict()
    dfNAICS['Description'] = dfNAICS['NAICS, Primary code(s)'].apply(lambda x: NAICSdescription_dict[x] if x in NAICSdescription_dict else '')
    dfNAICS['Description'].value_counts()
    dfNAICS['First'] = dfNAICS['NAICS, Primary code(s)'].astype(str).str[0]
    dfNAICS['First'].value_counts()
    dfNAICS['Two'] = dfNAICS['NAICS, Primary code(s)'].astype(str).str[:2]
    dfNAICS['Two'].value_counts()
    dfNAICS['Three'] = dfNAICS['NAICS, Primary code(s)'].astype(str).str[:3]
    dfNAICS['Three'].value_counts()
    PatstatOrbis = pd.read_csv(PATHS.dropbox / 'Data_outputted/PatstatOrbis/Patstat_bvdid_GUO50_names_counts.csv')
    dfNAICS = dfNAICS.merge(PatstatOrbis[['bvdid', 'NAME', 'FamCount']], left_on='BvD ID number', right_on='bvdid', how='left')
    LOGGER.info('Top 2-digit Primary NAICS codes:  (numbers represent the number of firms with particular codes)')
    LOGGER.info(dfNAICS['Two'].value_counts().iloc[:10])
    return

